library(tidyverse)
library(tidymodels)
library(workflows)
library(magrittr)
source('data_retrieval.R')
source('preprocess.R')
source('modeling.R')
source('data_retrieval.R')

# This script later stores its test AUC in a variable called `roc_auc`.
# Drop a stale copy from a previous run, if there is one. `rm()` only touches
# the current environment, so the attached `roc_auc()` function is unaffected.
# The existence check avoids the "object 'roc_auc' not found" warning that an
# unconditional `rm()` raises in a fresh session.
if (exists("roc_auc", inherits = FALSE)) {
  rm(roc_auc)
}

# Fetch the revision-cancer event rows through the `con` connection, keep the
# rows whose event number is 7 or lower, then run them through
# `do_intensity()`.
revision_cancer_events <- con %>%
  get_revision_cancer_events_data() %>%
  filter(EVE_event_number <= 7) %>%
  do_intensity()

# Fix the RNG state before the feature / cleaning / split pipeline runs.
set.seed(42)

# Feature engineering -> cleaning -> join of the stored "revision_p0.RData"
# values -> hashed split. The result exposes `$train` and `$test`, which are
# used further down. The surrounding parentheses echo the split to the console.
(revision_cancer_events_split <- revision_cancer_events %>%
  add_UTL_features() %>%
  eol_cleaning_pipeline() %>%
  join_phat0(file_name = "revision_p0.RData") %>%
  create_hashed_split(0.5))

# Both helpers work on the training part of the split, with
# `DMG_died_within_365d` as the outcome column.
# `calibration_proportion` is handed to `calibrate_preds()` later on;
# `downsampled_train` is what the model is tuned and fitted on.
calibration_proportion <- revision_cancer_events_split$train %>%
  get_calibration_proportion(DMG_died_within_365d)
downsampled_train <- revision_cancer_events_split$train %>%
  downsample_data(DMG_died_within_365d)

# Starting hyper-parameter values passed to `run_bayes_optimisation()`.
# The outer parentheses make the assignment echo the list to the console.
(initial_model_params <- list(
  tree_depth = 8,
  learn_rate = 0.123001965989824,
  loss_reduction = 11.1582974204794,
  min_n = 2.44744561146945,
  sample_size = 0.773196286475286,
  mtry = 0.549052911438048,
  trees = 400
))

# ---- Hyper-parameter search ----
# Wall-clock start of the search, used for the timing message below.
tic <- Sys.time()

# Search on the down-sampled training data, starting from
# `initial_model_params`. The trailing 50 is forwarded to
# `run_bayes_optimisation()` (presumably the iteration budget -- TODO confirm).
# `select_best()` keeps the winning candidate; `.config` is dropped so that
# only the remaining columns are spliced into `set_args()` below.
best_params <- run_bayes_optimisation(
  downsampled_train,
  DMG_died_within_365d,
  initial_model_params,
  50
) %>%
  select_best() %>%
  select(-.config)

print(paste0("Hyperparamter search took ", difftime(Sys.time(), tic, units = "hours"), " hours!"))

# ---- Train model ----
# Base xgboost spec with the selected parameters spliced in, plus 40 threads.
best_model_spec <- get_xgboost_spec() %>%
  set_args(nthread = 40, !!!best_params)

# Wall-clock start of the final fit.
toc <- Sys.time()

# Fit on the down-sampled training data. Left-assignment replaces the
# original right-assignment (`->`); the parentheses still echo the fit.
(xgboost_fitted <- downsampled_train %>%
  train_model(best_model_spec, outcome = DMG_died_within_365d))

print(paste0("Model training took ", difftime(Sys.time(), toc, units = "hours"), " hours!"))

# ---- Evaluate on the test part of the split and persist the results ----

# Predictions of the fitted workflow on `revision_cancer_events_split$test`.
# The AUC call below reads the columns `true_value` and `.pred_1` from the
# calibrated result.
preds_test <- get_model_preds(
  model = xgboost_fitted,
  data = revision_cancer_events_split$test,
  true_outcome = "DMG_died_within_365d"
)

# Adjust `.pred_1` using the proportion computed on the training data
# (presumably to correct for the down-sampling -- confirm in modeling.R).
preds_test_calibrate <- calibrate_preds(preds_test, calibration_proportion, .pred_1)

# The result is stored under the same name as the `roc_auc()` function.
# The call still resolves to the function; the variable only shadows it for
# non-call lookups.
roc_auc <- roc_auc(preds_test_calibrate, truth = true_value, .pred_1)

feature_importance <- get_feature_importance(xgboost_fitted)

# NOTE(review): `pull_workflow_fit()` is deprecated in recent workflows
# releases in favour of `extract_fit_parsnip()`; switch once the minimum
# installed version is confirmed.
fit_xgboost <- pull_workflow_fit(xgboost_fitted)

# Bundle everything that gets written to disk (uses `<-` rather than `=` for
# top-level assignment).
cancer_revision_events_model <- list(
  fit_xgboost = fit_xgboost,
  preds_test = preds_test_calibrate,
  roc_auc = roc_auc,
  feature_importance = feature_importance
)

save(
  cancer_revision_events_model,
  file = "revision_events_intensity.RData"
)




# ---- Train AUC ----

# Reload the bundle from "revision_events_intensity.RData". Its
# `fit_xgboost` and `roc_auc` elements are used below.
initial_predict <- get_load_new(file_name = "revision_events_intensity.RData")

# Prepare the recipe on the down-sampled training data and extract the
# processed training set.
recipe <- make_matrix_recipe(downsampled_train, DMG_died_within_365d) %>%
  prep()
train_data_preprocessed <- recipe %>%
  juice()

# Class probabilities for the training rows, predicted without the outcome
# column.
preds <- predict(
  initial_predict$fit_xgboost,
  train_data_preprocessed %>% select(-DMG_died_within_365d),
  type = "prob"
)

preds_train_calibrate <- calibrate_preds(preds, calibration_proportion, .pred_1)

# yardstick documents `truth` as a column of `data`. An external vector
# (`train_data_preprocessed$DMG_died_within_365d`) is rejected by releases
# that resolve `truth` via tidyselect, so the outcome is attached as a column
# and referenced by name. The rows of `preds_train_calibrate` are assumed to
# be in the same order as `train_data_preprocessed`, as the original call
# also required.
# NOTE(review): the test AUC is computed on `.pred_1`, while this call uses
# `preds_after_bayes`. Confirm that `calibrate_preds()` produces that column.
roc_auc_train <- preds_train_calibrate %>%
  mutate(true_value = train_data_preprocessed$DMG_died_within_365d) %>%
  roc_auc(truth = true_value, preds_after_bayes)

# Stack the stored test AUC and the train AUC, label them, and export.
rbind(
  data.frame(initial_predict$roc_auc) %>% mutate(split = "test"),
  data.frame(roc_auc_train) %>% mutate(split = "train")
) %>%
  mutate(sample = "Events") %>%
  write_csv("revision_events_AUC.csv")


